/* 
This do file creates a list of establishments in our amenities sample, with tags for the following: 
- ever in RAIS 
- single establishment firms 
- treatment status of estab 

It is used as a link between our amenities data and the RAIS data 

		//input: estab_union_act, contracts_cnes, contracts_act, act, estabchars_wide, rais_20102017, rais_estabid_fakeid_link_20200819_combined
		//output: estabtreat_forRAIS_deid

*/

cap log close
cap log using "$logs/link_amenities_rais", replace 

* Load the amenities sample
use "$files/estab_union_act.dta", clear 
keep if ind_mode!=. & microregion_mode!=.
keep if treat!=. 
keep if inrange(year, 2011, 2017)

// Indicator for whether estab has a pre-period contract and a post-period contract
xtset estab_union_pair  year
drop post
gen post = (year>=2015)
gen pre = (year<2015)

egen tag_employer = tag(employer_id)
egen tag_estab_union_pair = tag(estab_union_pair)
bys employer_id: egen haveprecontract=max(pre)
lab var haveprecontract "Have a pre-period contract (2011-2014)"
tab haveprecontract if tag_employer==1
tab haveprecontract if tag_estab_union_pair==1

bys employer_id: egen havepostcontract=max(post)
lab var havepostcontract "Have a post-period contract (2015-2017)"
tab havepostcontract if tag_employer==1
tab havepostcontract if tag_estab_union_pair==1

* Assign treatment status on any pairs of coverage
gen treat_pair = treat
egen maxt = max(treat_pair), by(employer_id)
egen mint = min(treat_pair), by(employer_id)
mdesc maxt mint

// Treatment at the estab (rather than pair level)
keep employer_id union_id maxt mint haveprecontract havepostcontract only1union
egen min_union = mode(union_id), by(employer_id) minmode
egen max_union = mode(union_id), by(employer_id) maxmode
duplicates drop 
isid employer_id union_id 
//always treated
unique(employer_id) if maxt==1 & mint==1 
//always control
unique(employer_id) if maxt==0 & mint==0 
//sometimes treated (will consider treated)
unique(employer_id) if maxt==1 & mint==0 
gen treat = maxt
gen disputet= (maxt==1 & mint==0)
lab var treat "Treatment status of establishment"
lab var disputet "Sign with some treated and some untreated unions"

//assign a union to each employer (choose randomly when more than one exists)
drop union_id
gen rrr = runiform()
egen xxx = max(rrr), by(employer_id)
gen union_id = min_union if xxx<0.5
replace union_id = max_union if xxx>=0.5

keep employer_id union_id treat disputet only1union haveprecontract havepostcontract
order employer_id union_id treat disputet only1union haveprecontract havepostcontract
duplicates drop
isid employer_id 
tab treat

tempfile temp
save `temp'

* Get the CBAs closest to the reform for our sample
use "$files/unions/contracts_cnes.dta", clear
keep contract_id employer_id union_id 
merge m:1 contract_id using "$files/contracts_act.dta"
keep if _merge == 3
drop _merge 
//keep "closest" CBA
keep contract_id employer_id year
gen yyy = abs(year-2015)
egen xxx = min(yyy), by(employer_id)
keep if yyy==xxx
sort employer_id year contract_id, stable
egen xxx2 = tag(employer_id)
keep if xxx2==1
keep contract_id employer_id
duplicates drop
//focus only on the CBAs in our sample
merge m:1 employer_id using `temp', keepusing(employer_id)
keep if _merge==3
drop _merge
//choose the smallest contract_id among those signed in the year closest to the reform
sort employer_id contract_id, stable
bys employer_id: keep if _n==1
tempfile contract
save `contract'
	
* Merge in the CBA identifiers 
use `temp', clear
merge 1:1 employer_id using `contract'
drop _merge
//number of signing establishments
count
tempfile temp
save `temp'

* Get the establishments covered by the "closest CBA"
use `contract', clear
keep contract_id
duplicates drop
merge 1:m contract_id using "$files/unions/act.dta", keepusing(estabid)
drop if _merge==2
tab _merge 
drop _merge
tempfile contract
save `contract'	
	
* Merge in the CBA identifiers 
use `temp', clear
joinby contract_id using `contract', unmatched(master)
tab _merge
drop _merge
//get cnpj of signing establishment
gen cnpj_sign=subinstr(employer_id, ".", "", .)
replace cnpj_sign=subinstr(cnpj_sign, "/", "", .)
replace cnpj_sign=subinstr(cnpj_sign, "-", "", .)
gen h = length(cnpj_sign)
replace cnpj_sign = "00"+cnpj_sign if h==12
drop h 
//get cnpj of covered establishment
tostring estabid, gen(xxx) format("%16.0g")
gen cnpj_cover = substr(14*"0",1,14-length(xxx))+xxx
gen h = length(cnpj_cover)
replace cnpj_cover = "00"+cnpj_cover if h==12
drop h xxx
//covered 
gen covered = (estabid!=.)
replace cnpj_cover = cnpj_sign if covered==0
destring cnpj_cover, gen(xxx) 
replace estabid = xxx if covered==0
drop xxx
//signing = covered
gen signing = (cnpj_sign==cnpj_cover)

* Ensure same dataset as before when restricting to signing==1
unique employer_id
unique employer_id if signing==1
//check that there is at most 1 sigining establishment per employer_id
egen xxx = sum(signing), by(employer_id)
egen yyy = tag(employer_id)
tab xxx if yyy==1
//add establishments with 0 back with signing=1 and cover=0
expand 2 if xxx==0 & yyy==1, gen(zzz)
drop xxx yyy
destring cnpj_sign, gen(xxx) 
replace estabid = xxx if zzz==1
replace cnpj_cover = cnpj_sign if zzz==1
replace cover = 0 if zzz==1
drop xxx zzz
//final check
drop signing
gen signing = (cnpj_sign==cnpj_cover)
unique employer_id
unique employer_id if signing==1
egen xxx = sum(signing), by(employer_id)
egen yyy = tag(employer_id)
tab xxx if yyy==1
drop xxx yyy cnpj_cover cnpj_sign
tab treat if signing==1

// when multiple observations exist keep 
// 1) observation where establishment is signing
// 2) if not signing: has post contract, no disputed treatment, random
gen crit1 = -signing
gen crit2 = -havepostcontract
gen crit3 = disputet
gen crit4 = runiform()
sort estabid crit1 crit2 crit3 crit4, stable
bysort estabid: keep if _n==1
drop crit*

//get variables for sample retsrictions
gen double fakeid_estab = estabid 
merge 1:1 fakeid_estab using "$files/estabchars_wide.dta", keepusing(ind_mode microregion_mode has_2014 estabidtype_mode) keep(1 3)
drop if _merge==1
drop _merge
gen todrop = (ind_mode==.)|(microregion_mode==.)|(has_2014!=1)
tab todrop
drop if todrop==1
drop todrop fakeid_estab
drop has_2014 ind_mode microregion_mode

mdesc
tab treat if signing==1
tab treat

// save (with cnpj for merging)
tostring estabid, gen(cnpj) format("%14.0f")
gen a = length(cnpj) 
* CNPJ
replace cnpj = "0"*(14-a)+cnpj if  estabidtype_mode==1
* CEI
replace cnpj = "0"*(12-a)+cnpj if  estabidtype_mode==3
drop a estabidtype_mode
tempfile establist 
save `establist'

* Establishment identifiers
use "$files/rais/BR/rais_20102017.dta", clear
keep fakeid_firm fakeid_estab estabid_type
duplicates drop
gen double estabid_cnpjcei = fakeid_estab
tostring estabid_cnpjcei, gen(cnpj) format("%14.0f")
gen a = length(cnpj) 
// CNPJ
replace cnpj = "0"*(14-a)+cnpj if  estabid_type==1
// CEI
replace cnpj = "0"*(12-a)+cnpj if  estabid_type==3

keep cnpj fakeid_firm fakeid_estab estabid_type
duplicates drop cnpj, force
	
// Get No. of establishments per firm
	* first 8 digits of CNPJ
	gen firmid=substr(cnpj, 1, 8) if estabid_type==1
	* full CEI
	replace firmid=substr(cnpj, 1, 12) if estabid_type==3
	gen n=1
	bys firmid: egen num_est=sum(n)
	drop n
	lab var num_est "Number of establishments in firm"
	sum num_est, d
	gen singest=(num_est==1)
	lab var singest "Single establishment firm"
	tab singest
	
// Save xwalk of ids with tag for num_est and singest 
	tempfile links
	save `links'
	
// merge with our amenities sample 
	use `establist', clear
	merge 1:1 cnpj using `links'
	drop if _merge==2
	ren _merge raismerge
	label var raismerge "Is in RAIS (ever)"
	
// save all 
	drop cnpj employer_id estabid_type firmid 
	save "$files/estabtreat_forRAIS_deid.dta", replace
	
// save singestab
	keep if singest == 1 
	save "$files/estabtreat_forRAIS_singest_deid", replace 

cap log close
